import re
import os
import os.path

# The HTML entity table lives in a different module on Python 2 and Python 3.
try:
    from htmlentitydefs import name2codepoint
except ImportError:
    from html.entities import name2codepoint


# Preamble written first by Indexer.output(), ahead of the <UL> list of
# entries.  The literal begins with a newline, so the generated file starts
# with a blank line before the DOCTYPE declaration.
TOC_HEADER = """
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<meta name="GENERATOR" content="Microsoft&reg; HTML Help Workshop 4.1">
<!-- Generated via preshrunk-cotton -->
<!-- Sitemap 1.0 -->
</HEAD><BODY>
<OBJECT type="text/site properties">
</OBJECT>
"""

# Closing markup written last by Indexer.output(), after the </UL> tag.
TOC_FOOTER = "</BODY></HTML>"

# Template for a single index entry; Indexer.output() fills it in with
# str.format().  {name} receives the heading text and {file} the path of the
# HTML file in which the heading was found.
FILE_ENTRY = """<LI> <OBJECT type="text/sitemap">
        <param name="Name" value="{name}">
        <param name="Local" value="{file}">
        <param name="ImageNumber" value="0">
    </OBJECT>
"""

class Indexer(object):
    """A default index class for handling a simple scan of the file system.

    Typical use is ``Indexer(directory)``, then ``process()`` to collect the
    headings of every ``.html`` file below ``directory``, then
    ``output(filename)`` to write the collected entries as a sitemap index.

    Attributes:
        startdir: The directory that ``process()`` walks.
        entries: List of ``(heading, filename)`` tuples collected so far,
            where ``filename`` is relative to ``startdir`` and uses ``/`` as
            the separator.
    """

    # <hN ...> ... </hN>, case-insensitive.  The back-reference makes the
    # closing tag use the same level as the opening one, and DOTALL lets the
    # heading body span several lines.
    _HEADING_RE = re.compile(r"<h([1-9])(?:\s[^>]*)?>(.*?)</h\1\s*>",
                             re.DOTALL | re.IGNORECASE)

    # Any tag, including one that is broken across lines.
    _TAG_RE = re.compile(r"<[^>]*>")

    # Runs of ASCII whitespace; they are collapsed the way a browser renders
    # them.  Non-breaking spaces decoded from &nbsp; are left untouched
    # because this substitution runs before entities are decoded.
    _SPACE_RE = re.compile(r"[ \t\r\n\f]+")

    # Named character references (&amp;, &quot;, ...) known to name2codepoint.
    # Built once here instead of once per heading.
    _ENTITY_RE = re.compile('&(%s);' % '|'.join(name2codepoint))

    def __init__(self, directory):
        """Remember ``directory`` as the scan root and start with no entries."""
        self.startdir = directory
        self.entries = []

    @staticmethod
    def _escape_attr(value):
        """Return ``value`` made safe for use inside a double-quoted attribute.

        Only ``&``, ``<``, ``>`` and ``"`` are replaced.  The ampersand is
        handled first so the references produced by the later replacements
        are not escaped a second time.
        """
        return (value.replace("&", "&amp;")
                     .replace("<", "&lt;")
                     .replace(">", "&gt;")
                     .replace('"', "&quot;"))

    def get_headings_in_html(self, filename):
        """Return the text of every heading found in an HTML file.

        The file is searched with some extra-simple regular expressions
        rather than a real HTML parser.  The whole file is read at once so
        that a heading broken across several lines is still found, and
        heading tags carrying attributes (``<h1 class="x">``) are accepted.

        Args:
            filename: Path of the HTML file to read.  It is opened in text
                mode with the interpreter's default encoding.

        Returns:
            A list of heading strings in document order.  Nested tags are
            removed (their contents are kept), whitespace runs are collapsed
            to one space, named character references are decoded and the
            result is stripped.  Headings that end up empty are omitted.
        """
        # The context manager closes the file even if reading raises.
        with open(filename, 'r') as fi:
            content = fi.read()

        headings = []
        for found in self._HEADING_RE.finditer(content):
            # Remove tags, but keep inner contents
            text = self._TAG_RE.sub("", found.group(2))

            # A heading wrapped over several lines becomes a single line
            text = self._SPACE_RE.sub(" ", text)

            # Decode escaped characters.  This happens after tag removal so
            # that an escaped "&lt;b&gt;" survives as literal text.
            text = self._ENTITY_RE.sub(
                lambda m: chr(name2codepoint[m.group(1)]), text)

            text = text.strip()

            # An empty heading would only produce a blank index entry
            if text:
                headings.append(text)

        return headings

    def process(self):
        """Walk the directory tree and index the headings of every HTML file.

        Every file below ``startdir`` whose name ends in ``.html`` (compared
        case-insensitively) is scanned with ``get_headings_in_html``.  One
        ``(heading, filename)`` tuple per heading is appended to ``entries``,
        and ``entries`` is then sorted by heading text.

        Entries are appended to whatever ``entries`` already holds, so
        calling this method twice records every heading twice.
        """
        for root, dirs, files in os.walk(self.startdir):

            # Directory of the current files, relative to the scan root;
            # this is "." for the root itself, giving paths like "./a.html".
            rel = os.path.relpath(root, self.startdir)

            for entry in files:
                if not entry.lower().endswith('.html'):
                    continue

                headings = self.get_headings_in_html(os.path.join(root, entry))

                # The index always uses forward slashes, whatever the OS
                filename = os.path.join(rel, entry)
                if os.sep != '/':
                    filename = filename.replace(os.sep, '/')

                for heading in headings:
                    self.entries.append((heading, filename))

        # Sort the index alphabetically by entry (not filename)
        self.entries.sort(key=lambda x: x[0])

    def output(self, filename):
        """Write the collected entries as an index file.

        The file consists of ``TOC_HEADER``, a ``<UL>`` list holding one
        ``FILE_ENTRY`` per item of ``entries``, and ``TOC_FOOTER``.

        Args:
            filename: Path of the file to create or overwrite.
        """
        # The context manager closes the file even if a write raises.
        with open(filename, "w") as ixf:
            ixf.write(TOC_HEADER)
            ixf.write("<UL>\n")
            for entry in self.entries:
                # Headings were entity-decoded while scanning, so they may
                # contain a raw '"' or '&' that would break the attribute.
                ixf.write(FILE_ENTRY.format(name=self._escape_attr(entry[0]),
                                            file=self._escape_attr(entry[1])))

            ixf.write("</UL>\n")
            ixf.write(TOC_FOOTER)
